To begin with, we need to import necessary python packages.
import numpy as np
import pandas as pd
import statsmodels.api
import scipy.stats
import bokeh.io
import bokeh.plotting
from Bio import Phylo
import io
import cmdstanpy
import arviz as az
import bebi103
bokeh.io.output_notebook()
We can now read in the data into a dataframe for analyis.
df = pd.read_csv("./20190322_supp_table_2.csv")
We take a look at the format for the data.
df['species_underscore'] = [spec.replace(" ", "_") for spec in df['species']]
df.head()
For some of this analysis, we will look at the per-species averages for our measurements. To get this, we use a simple aggregate function on the dataframe and take a look at the results.
df_averages = df.groupby(['species', 'species_underscore', 'spiracle'], as_index=False).aggregate(np.average)
df_averages['subfamily'] = df.groupby(['species', 'species_underscore', 'spiracle'], as_index=False).aggregate(max)['subfamily']
df_averages.head()
species_per_subfam=df_averages.groupby(['subfamily', 'spiracle'], as_index=False).count().groupby('subfamily').aggregate(max).reset_index()[['subfamily', 'species']]
species_per_subfam.columns = ('subfamily', 'subfam_count')
species_per_subfam
df_averages = df_averages.merge(species_per_subfam, on='subfamily')
For our plots, we will log transform the data. We will add a column to the dataframe with the log transformed data.
df_averages.head()
df_averages['log area (mm^2)'] = np.log10(df_averages['area (mm^2)'])
df_averages['log dist'] = np.log10(df_averages['depth (mm)'])
df_averages['log mass (g)'] = np.log10(df_averages['mass (g)'])
df_averages['log area/dist'] = np.log10(df_averages['area (mm^2)']/df_averages['depth (mm)'])
df_averages['log area^2/dist'] = np.log10(df_averages['area (mm^2)']**2/df_averages['depth (mm)'])
df_averages.head()
In addition to log transforming the species averaged data, we will do the same for the whole data set.
df['log area (mm^2)'] = np.log10(df['area (mm^2)'])
df['log dist'] = np.log10(df['depth (mm)'])
df['log mass (g)'] = np.log10(df['mass (g)'])
df['log area/dist'] = np.log10(df['area (mm^2)']/df['depth (mm)'])
df['log area^2/dist'] = np.log10(df['area (mm^2)']**2/df['depth (mm)'])
df.head()
tree = Phylo.read(io.StringIO("(((Cyclocephala_borealis:1.0,(Dynastes_hercules:1.0,Trypoxylus_dichotomus:1.0):1.0):1.0,Popilia_japonica:1.0):1.0,(Protaetia_orientalis:1.0,((((Coelorrhina_hornimani:1.0,Mecynorrhina_torquata:1.0):1,Dicronorrhina_derbyana:1):1,Eudicella_euthalia:1.0):1,Goliathus_goliathus:1.0):1.0):1.0):0.0;"),
"newick")
Phylo.draw(tree)
species = df.sort_values('mass (g)')['species_underscore'].unique()
cov = np.zeros(shape=(len(species), len(species)))
comp_matrix = np.zeros(shape=(len(species), len(species))).astype(str)
for i, spec in enumerate(species):
for j, comp in enumerate(species):
if i == j:
cov[i, j] = 6.0
continue
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
comp_matrix[i, j] = spec + ':' + comp
cov
print(pd.DataFrame(cov))
pd.DataFrame(comp_matrix)
df_averages.loc[df_averages['spiracle'] == '1']
With our data in this format, we can build the PGLS model for our regression. To do this, we will write a statistical model for the data. We choose a multivariate normal distribution with covariance matrix given by the above matrix adjusted by the parameter $\lambda$ (as in PGLS), and put a prior on $\lambda$. This gives
\begin{align} f(x_i; a, b) &= ax_i + b \\ b &\sim \text{Norm}(0, 1) \\ a &\sim \text{Norm}(0, 2) \\ \lambda &\sim \text{beta}(1.1, 1.1) \\ y_i &\sim \text{multi_normal}\left(f_\mu(x_i; a, b), f_\sigma(\mathrm{cov}, \lambda)\right) \end{align}Time for a prior predictive check on our model!
sm_prior_pred = cmdstanpy.CmdStanModel(
stan_file="spiracle_pgls_prior_predictive.stan"
)
x = df_averages.sort_values('mass (g)').loc[df_averages['spiracle'] == '1', 'log mass (g)'].values
data = {
"N": len(x),
"x": x,
"a_mu": 0.66,
"a_sig": 0.3,
"b_mu": -1.0,
"b_sig": 1.0,
"lambda_alpha": 1.1,
"lambda_beta": 1.1,
"cov_phylo":cov,
}
samples = sm_prior_pred.sample(data=data, sampling_iters=1000, fixed_param=True)
samples = az.from_cmdstanpy(posterior=samples, prior=samples, prior_predictive=['y'])
bokeh.io.show(
bebi103.viz.predictive_regression(
samples.prior_predictive['y'],
samples_x=x,
percentiles=[30, 60, 90, 99],
x_axis_label='mass',
y_axis_label='area',
)
)
p = bokeh.plotting.figure()
p.circle('log mass (g)', 'log area (mm^2)', source=df_averages.loc[df_averages['spiracle'] == '1'])
bokeh.io.show(p)
size = 1000
lines = [a*x+b+sample for a, b, sample in zip(np.random.normal(0,2, size=size), np.random.normal(0,1,size=size),
np.random.multivariate_normal(x, cov, size=size))]#+np.random.normal(0, 3, size=(100,) ))]
p = bokeh.plotting.figure()
[p.line(x, y, alpha=0.2) for y in lines]
bokeh.io.show(p)
species = df.sort_values('mass (g)')['species_underscore'].unique()
cov = np.zeros(shape=(len(species), len(species)))
comp_matrix = np.zeros(shape=(len(species), len(species))).astype(str)
for i, spec in enumerate(species):
for j, comp in enumerate(species):
#if i == j:
# cov[i, j] = 6.0
# continue
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
comp_matrix[i, j] = spec + ':' + comp
size = 1000
lines = [a*x+b+sample for a, b, sample in zip(np.random.normal(0,2, size=size), np.random.normal(0,1,size=size),
np.random.multivariate_normal(x, cov, size=size))]#+np.random.normal(0, 3, size=(100,) ))]
p = bokeh.plotting.figure()
[p.line(x, y, alpha=0.2) for y in lines]
bokeh.io.show(p)
cov
plots = []
sample = np.random.multivariate_normal(x, cov, size=1000)
num = df_averages.sort_values('mass (g)').loc[df_averages.sort_values('mass (g)')['spiracle']=='1', 'subfam_count'].values
for i in range(len(sample[0])):
for j in range(len(sample[0]))[i:-1]:
p = bokeh.plotting.figure(plot_width=200, plot_height=200, x_range=(-5, 5), y_range=(-5, 5), title='comp of ' +str(i)+' and '+str(j)+' | '+str(num[i])+','+str(num[j]))
p.circle(sample[:, i], sample[:, j], alpha=0.1)
plots.append(p)
bokeh.io.show(bokeh.layouts.gridplot(plots, ncols=4))
p = bokeh.plotting.figure()
[p.circle(x, sample, alpha=0.05) for sample in np.random.multivariate_normal(x, cov, size=1000)]
bokeh.io.show(p)
print(df_averages.sort_values('mass (g)').loc[df_averages.sort_values('mass (g)')['spiracle']=='1', 'subfam_count'].values)
Phylo.draw(tree)
species = df.sort_values('mass (g)')['species_underscore'].unique()
cov = np.zeros(shape=(len(species), len(species)))
comp_matrix = np.zeros(shape=(len(species), len(species))).astype(str)
for i, spec in enumerate(species):
for j, comp in enumerate(species):
if i == j:
cov[i, j] = 6.0
continue
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
comp_matrix[i, j] = spec + ':' + comp
cov*1/1000
x = df_averages.sort_values('mass (g)').loc[df_averages['spiracle'] == 'S', 'log mass (g)'].values
y = df_averages.sort_values('mass (g)').loc[df_averages['spiracle'] == 'S', 'log area/dist'].values
data = {
"N": len(x),
"x": x,
"y": y,
"cov_phylo":cov,
}
sm = cmdstanpy.CmdStanModel(stan_file='spiracle_pgls_model.stan')
samples = sm.sample(data=data, sampling_iters=1000, chains=4)
samples = az.from_cmdstanpy(posterior=samples, posterior_predictive=["y_ppc"])
bebi103.stan.check_all_diagnostics(samples, var_names=['b', 'a'])
bokeh.io.show(
bebi103.viz.corner(
samples, pars=["a", "b", "lambda", "sigma"], xtick_label_orientation=np.pi / 4
)
)
p = bokeh.plotting.figure()
norm = np.linspace(-1, 1.5, 200)
bebi103.viz.ecdf(samples.posterior['a'].values.ravel(), color='red', p=p)
p.line(norm, scipy.stats.norm.cdf(norm, 0.33, 0.3))
bokeh.io.show(p)
az.summary(samples, var_names=['b', 'a', 'lambda'])
az.summary(samples, var_names=['b', 'a'])['mean']['a'] + az.summary(samples, var_names=['b', 'a'])['sd']['a']*1.96
az.summary(samples, var_names=['b', 'a'])['mean']['a'] - az.summary(samples, var_names=['b', 'a'])['sd']['a']*1.96
bokeh.io.show(bebi103.viz.ecdf(samples.posterior['a'].values.ravel()-0.333))
np.sum(samples.posterior['a'].values.ravel()-0.333 < 0)/len(samples.posterior['a'].values.ravel())
y_ppc = samples.posterior_predictive['y_ppc'].stack(
{"sample": ("chain", "draw")}
).transpose("sample", "y_ppc_dim_0")
bokeh.io.show(
bebi103.viz.predictive_regression(
y_ppc,
samples_x=x,
percentiles=[30, 50, 70, 95],
data=np.vstack((x, y)).transpose(),
x_axis_label='ant density, k [ant/cm^2]',
y_axis_label='ant flow, q [ant/cm/s]',
x_range=[x.min()-0.1, x.max()+0.1],
)
)
p = bokeh.plotting.figure(x_axis_label='log mass', y_axis_label='log area')
Y = df.sort_values('mass (g)').loc[df['spiracle'] == '2', 'log area/dist'].values
X = df.sort_values('mass (g)').loc[df['spiracle'] == '2', 'log mass (g)'].values
X = statsmodels.api.add_constant(X)
model = statsmodels.api.OLS(Y,X)
results = model.fit()
results.params
p.circle(df.sort_values('mass (g)').loc[df['spiracle'] == '2', 'log mass (g)'].values,
df.sort_values('mass (g)').loc[df['spiracle'] == '2', 'log area/dist'].values, color='black', size=15, alpha=0.3, fill_color='green')
p.line(np.linspace(x.min(), x.max(), 200),
results.params[1]*np.linspace(x.min(), x.max(), 200) + results.params[0],
line_width=5, line_dash='dashed', color='orange', alpha=0.3)
Y = y
X = x
X = statsmodels.api.add_constant(X)
model = statsmodels.api.OLS(Y,X)
results = model.fit()
results.params
p.line(np.linspace(x.min(), x.max(), 200),
az.summary(samples, var_names=['b', 'a'])['mean']['a']*np.linspace(x.min(), x.max(), 200) + az.summary(samples, var_names=['b', 'a'])['mean']['b'],
line_width=5, color='black')
p.line(np.linspace(x.min(), x.max(), 200),
results.params[1]*np.linspace(x.min(), x.max(), 200) + results.params[0],
line_width=5, line_dash='dashed', color='grey')
p.circle(x, y, color='black', size=15, alpha=0.5, fill_color='red')
bokeh.io.show(p)
p = bokeh.plotting.figure(x_axis_label='log mass', y_axis_label='log area')
p.circle(df.sort_values('mass (g)').loc[df['spiracle'] == '1', 'log mass (g)'].values,
df.sort_values('mass (g)').loc[df['spiracle'] == '1', 'log area (mm^2)'].values, color='black', size=15, alpha=0.8, fill_color='red')
bokeh.io.show(p)
species = df.sort_values('mass (g)')['species_underscore'].unique()
cov = np.zeros(shape=(len(species), len(species)))
comp_matrix = np.zeros(shape=(len(species), len(species))).astype(str)
for i, spec in enumerate(species):
for j, comp in enumerate(species):
comp_matrix[i, j] = spec + ':' + comp
if i == j:
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
continue
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
cov
cov=np.linalg.inv(cov)
print(pd.DataFrame(np.round(cov, decimals=3)))
x = df_averages.sort_values('mass (g)').loc[df_averages['spiracle'] == '3', 'log mass (g)'].values
y = df_averages.sort_values('mass (g)').loc[df_averages['spiracle'] == '3', 'log area (mm^2)'].values
data = {
"N": len(x),
"x": x,
"y": y,
"cov_phylo":cov,
}
sm = cmdstanpy.CmdStanModel(stan_file='spiracle_pgls_model.stan')
samples = sm.sample(data=data, sampling_iters=1000, chains=4)
samples = az.from_cmdstanpy(posterior=samples, posterior_predictive=["y_ppc"])
bebi103.stan.check_all_diagnostics(samples, var_names=['b', 'a', 'lambda'])
bokeh.io.show(
bebi103.viz.corner(
samples, pars=["a", "b", "lambda"], xtick_label_orientation=np.pi / 4
)
)
y_ppc = samples.posterior_predictive['y_ppc'].stack(
{"sample": ("chain", "draw")}
).transpose("sample", "y_ppc_dim_0")
bokeh.io.show(
bebi103.viz.predictive_regression(
y_ppc,
samples_x=x,
percentiles=[30, 50, 70, 99],
data=np.vstack((x, y)).transpose(),
x_axis_label='ant density, k [ant/cm^2]',
y_axis_label='ant flow, q [ant/cm/s]',
x_range=[x.min()-0.1, x.max()+0.1],
)
)
p = bokeh.plotting.figure()
p.line(np.linspace(-1, 1.5, 200), np.linspace(-1, 1.5, 200)*0.7672-1.4551)
p.line(np.linspace(-1, 1.5, 200), np.linspace(-1, 1.5, 200)*0.5887-1.3282, color='red')
p.line(np.linspace(-1, 1.5, 200), np.linspace(-1, 1.5, 200)*az.summary(samples, var_names=['b', 'a', 'lambda'])['mean'][1]
+ az.summary(samples, var_names=['b', 'a', 'lambda'])['mean'][0], color='black')
p.circle(x, y, size=15)
bokeh.io.show(p)
az.summary(samples, var_names=['b', 'a', 'lambda'])
library(nlme)
cat("((((Homo:0.21,Pongo:0.21):0.28,",
"Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1.00);",
file = "ex.tre", sep = "\n")
tree.primates <- read.tree("ex.tre")
X <- c(4.09434, 3.61092, 2.37024, 2.02815, -1.46968)
Y <- c(4.74493, 3.33220, 3.36730, 2.89037, 2.30259)
unlink("ex.tre") # delete the file "ex.tre"
m1 <- gls(Y ~ X, correlation=corBrownian(1, tree.primates))
corMatrix(m1$modelStruct$corStruct)
summary(m1)
m2 <- gls(Y ~ X, correlation=corMartins(1, tree.primates))
summary(m2)
corMatrix(m2$modelStruct$corStruct)
m3 <- gls(Y ~ X, correlation=corGrafen(1, tree.primates))
summary(m3)
corMatrix(m3$modelStruct$corStruct)
tree = Phylo.read(io.StringIO("((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1.00);"),
"newick")
Phylo.draw(tree)
species = ["Homo","Pongo","Macaca","Ateles","Galago"]
cov = np.zeros(shape=(len(species), len(species)))
comp_matrix = np.zeros(shape=(len(species), len(species))).astype(str)
for i, spec in enumerate(species):
for j, comp in enumerate(species):
comp_matrix[i, j] = spec + ':' + comp
if i == j:
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
continue
cov[i, j] = tree.depths()[tree.common_ancestor(spec, comp)]
pd.DataFrame(cov,index=species, columns=species)
X = np.array((4.09434, 3.61092, 2.37024, 2.02815, -1.46968))
Y = np.array((4.74493, 3.33220, 3.36730, 2.89037, 2.30259))
p = bokeh.plotting.figure()
p.circle(X, Y)
bokeh.io.show(p)
data = {
"N": len(X),
"x": X,
"y": Y,
"cov_phylo":cov,
}
sm = cmdstanpy.CmdStanModel(stan_file='spiracle_pgls_model.stan')
samples = sm.sample(data=data, sampling_iters=1000, chains=4)
samples = az.from_cmdstanpy(posterior=samples, posterior_predictive=["y_ppc"])
bebi103.stan.check_all_diagnostics(samples, var_names=['b', 'a'])
bokeh.io.show(
bebi103.viz.corner(
samples, pars=["a", "b"], xtick_label_orientation=np.pi / 4
)
)
az.summary(samples, ['a', 'b'])
np.std(np.array(samples.posterior['a']))
0.7754516*np.sqrt(5)
y_ppc = samples.posterior_predictive['y_ppc'].stack(
{"sample": ("chain", "draw")}
).transpose("sample", "y_ppc_dim_0")
bokeh.io.show(
bebi103.viz.predictive_regression(
y_ppc,
samples_x=X,
percentiles=[30, 50, 70, 99],
data=np.vstack((X, Y)).transpose(),
x_axis_label='ant density, k [ant/cm^2]',
y_axis_label='ant flow, q [ant/cm/s]',
x_range=[X.min()-0.1,X.max()+0.1],
)
)
bebi103.stan.clean_cmdstan()
cov=[[3.0, 2.0, 1.0, 1.0, 0.0],
[2.0, 3.0, 1.0, 1.0, 0.0],
[1.0, 1.0, 3.0, 2.0, 0.0],
[1.0, 1.0, 2.0, 3.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 3.0]]
cov=np.array([[3.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 3.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 3.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 3.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 3.0]])*1/10
x=np.array([1.02, 1.06, 0.96, 0.92, 0.89])
y=np.array([1.38, 1.41, 1.36, 1.22, 1.13])
p = bokeh.plotting.figure()
p.circle(x, y)
bokeh.io.show(p)
data = {
"N": len(x),
"x": x,
"y": y,
"cov_phylo":cov,
}
sm = cmdstanpy.CmdStanModel(stan_file='spiracle_pgls_model.stan')
samples = sm.sample(data=data, sampling_iters=1000, chains=4, adapt_delta=0.99)
samples = az.from_cmdstanpy(posterior=samples, posterior_predictive=["y_ppc"])
bebi103.stan.check_all_diagnostics(samples, var_names=['b', 'a', 'lambda'])
bokeh.io.show(
bebi103.viz.corner(
samples, pars=["a", "b", "lambda"], xtick_label_orientation=np.pi / 4
)
)
az.summary(samples, ['a', 'b', 'lambda'])
y_ppc = samples.posterior_predictive['y_ppc'].stack(
{"sample": ("chain", "draw")}
).transpose("sample", "y_ppc_dim_0")
bokeh.io.show(
bebi103.viz.predictive_regression(
y_ppc,
samples_x=x,
percentiles=[30, 50, 70, 99],
data=np.vstack((x, y)).transpose(),
x_axis_label='ant density, k [ant/cm^2]',
y_axis_label='ant flow, q [ant/cm/s]',
x_range=[x.min()-0.1,x.max()+0.1],
)
)